
****************************
*  DESCRIPTIVE STATISTICS  *
****************************
* Labussiere 2023


/* Produce the following outputs:
	In main text:
		Table 1
	
	In supplementary materials:
		Figures S1 S2 S3 S4 S5
		Tables S1 S3 S4	
*/

/* The descriptive statistics are calculated on two samples:
 - the analytical sample, i.e. second-generation students enrolled in primary 
   education between 2008 and 2015, not enrolled in practical education or in
   institutional households (hereafter the "full sample")
 - the group of exposure discordant families, i.e. families from the full sample
   where siblings have acquired Dutch citizenship at a different age 
   (hereafter the "SFE sample" )
 */

		***************
		* FULL SAMPLE * FIGURES S1, S3, S4 | TABLES 1, S1, S3, S4
		***************
		
* Load the full analytical sample
use full_sample.dta, clear

* Build the complete-case marker: non_missing is initialised by -mark- and then
* set to 0 by -markout- for every observation that is missing on at least one
* of the sociodemographic covariates listed below
mark non_missing
markout non_missing ///
	gender ///
	first_born_FULREG ///
	date_birth_cat ///
	parent_ed_level_miss ///
	homeowner_alo_cito ///
	q3_st_disp_income_cito ///
	secm_ma_agg2_cito ///
	secm_pa_agg2_cito ///
	country_of_birth_ma_agg ///
	type_household_cito ///
	nbr_children_cito

* Size of the complete-case (regression) sample
tabulate non_missing


******************************************************************
* TABLE 1
* Age at naturalisation
******************************************************************
* Distribution of the categorical age at naturalisation on the complete-case
* sample, with missing categories displayed
tabulate age_nat_13_cat7 if non_missing, missing
* Same distribution, additionally excluding observations coded .b on sum_nat_t5
tabulate age_nat_13_cat7 if non_missing & sum_nat_t5 != .b, missing


******************************************************************
* TABLE S1
* Age at the time of the Cito test
******************************************************************
* Frequency table of age_cito over all loaded observations
* NOTE(review): no non_missing restriction is applied here, unlike the other
* tables of this section - confirm that this is intended
tabulate age_cito


******************************************************************
* TABLE S3
* Descriptive statistics of all covariates on the regression sample
******************************************************************
* Indicator: mother and father share the same origin group.
* A bare (a == b) comparison evaluates to 0 when only one origin is missing and
* to 1 when both are missing, which would silently classify unknown origins.
* The indicator is therefore only computed when both origins are observed and
* is left missing otherwise.
gen same_country_mapa = (gbaherkomstgroepering_ma == gbaherkomstgroepering_pa) ///
	if !missing(gbaherkomstgroepering_ma, gbaherkomstgroepering_pa)

* Variables reported in Table S3.
* The last line deliberately carries no trailing ///, so that the definition
* does not depend on the following line being blank.
local control_variables age_nat_13_cat7 gender date_birth_cat first_born_FULREG ///
						secm_ma_agg2_cito secm_pa_agg2_cito homeowner_alo_cito ///
						nbr_children_cito type_household_cito parent_ed_level_miss ///
						country_of_birth_ma_agg q3_st_disp_income_cito ///
						date_birth_educ sum_nat_t5 ///
						track_4_cito date_nat_t5 same_country_mapa

* One-way frequency table of each variable on the regression sample (non_missing=1)
foreach var of local control_variables {
	tab `var' if non_missing
}


******************************************************************
* TABLE S4
* Distribution of the age at naturalisation
******************************************************************
* Frequency table of age_nat_t5_wmiss on the complete-case sample,
* with missing categories displayed
tabulate age_nat_t5_wmiss if non_missing, missing


******************************************************************
* FIGURE S1
* Distribution of the age difference between siblings
******************************************************************
* Smallest and largest value of date_birth_educ within each sibling_id_ma group
bysort sibling_id_ma: egen min_date_bith = min(date_birth_educ)
bysort sibling_id_ma: egen max_date_birth = max(date_birth_educ)

* Within-group spread: largest minus smallest date_birth_educ
generate age_diff = max_date_birth - min_date_bith

* Distribution of the spread: everyone, then groups with more than one sibling,
* then additionally restricted to the complete-case sample
tabulate age_diff, missing
tabulate age_diff if nbr_siblings_ma_CITO > 1
tabulate age_diff if nbr_siblings_ma_CITO > 1 & non_missing
* 72.5% of siblings have an age-gap of four years or less

* Histogram on the same restriction as the last tabulation
histogram age_diff if nbr_siblings_ma_CITO > 1 & non_missing, ///
	discrete width(1) percent ///
	color(red%30) graphregion(color(white)) ///
	xtick(0(1)11) ///
	xtitle("Maximum age gap between siblings", height(4)) ///
	ytitle("Percentage (%)", height(6))
graph export FigureS1.png, replace


******************************************************************
* FIGURE S3
* Average Cito score by age at naturalisation categories
******************************************************************
* Overall summary of the Cito score variable
summarize cito_score_st_year

* Mean Cito score within each age-at-naturalisation category, and a flag that
* marks exactly one observation per category so each mean is plotted once
egen mean_cito_c = mean(cito_score_st_year), by(age_nat_13_cat7)
egen tag_age_natc = tag(age_nat_13_cat7)

* Connected line plot of the category means
twoway connected mean_cito_c age_nat_13_cat7 if tag_age_natc, sort ///
	color("0 68 136") graphregion(color(white)) ///
	xtick(0(1)7) ///
	xlabel(0 "0" 1 "1-2" 2 "3-4" 3 "5-6" 4 "7-10" 5 "11-13" 6 "14+") ///
	xtitle("Age at naturalisation", height(5)) ///
	ytitle("Standardised Cito test score", height(6))
graph export FigureS3.png, replace


******************************************************************
* FIGURE S4
* Average Cito score by age at naturalisation dummies
******************************************************************
* Mean Cito score for each value of age_nat_13_censored, and a flag that marks
* exactly one observation per value so each mean is plotted once
egen mean_cito_d = mean(cito_score_st_year), by(age_nat_13_censored)
egen tag_age_natd = tag(age_nat_13_censored)

* Connected line plot of the means
twoway connected mean_cito_d age_nat_13_censored if tag_age_natd, sort ///
	color("0 68 136") graphregion(color(white)) ///
	xtick(0(1)15) ///
	xtitle("Age at naturalisation", height(5)) ///
	ytitle("Standardised Cito test score", height(6))
graph export FigureS4.png, replace



		**************
		* SFE SAMPLE * FIGURES S2, S5
		**************

* Switch to the SFE sample (replaces the data currently in memory)
use sfe_sample.dta, clear

******************************************************************
* FIGURE S2
* distribution age difference between siblings
******************************************************************
* Smallest and largest value of date_birth_educ within each sibling_id_ma group,
* and the spread between them
bys sibling_id_ma: egen min_date_bith = min(date_birth_educ)
bys sibling_id_ma: egen max_date_birth = max(date_birth_educ)
gen age_diff = max_date_birth - min_date_bith
tab age_diff, miss
* 60.3% of siblings have an age-gap of four years or less

hist age_diff, percent ///
xtitle("Maximum age gap between siblings", height(4)) ///
ytitle("Percentage (%)", height(6)) graphregion(color(white)) ///
color(red%30) xtick(0(1)11) width(1) discrete
graph export FigureS2.png, replace
* some students are either 0 or more than seven years apart because one or 
* both sibling(s) repeated or skipped a grade in primary education

* distribution age difference between siblings at the time of Cito
* (spread of year_cito within each sibling_id_ma group)
bys sibling_id_ma: egen min_date_cito = min(year_cito)
bys sibling_id_ma: egen max_date_cito = max(year_cito)
gen age_diff_cito = max_date_cito - min_date_cito
* Display the distribution: the variable was previously generated but never
* tabulated, so the percentage quoted below could not be reproduced from the log
tab age_diff_cito, miss
* some siblings take the cito test at the same date but at a different age 
* 65.6% of siblings have an age-gap of four years or less


******************************************************************
* FIGURE S5
* histogram of the (maximum) difference in age at naturalisation
* between siblings
******************************************************************
* Order observations by sibling group, then by person identifier
sort sibling_id_ma rinpersoon

* Lowest and highest age_nat_13_censored within each sibling_id_ma group
bysort sibling_id_ma: egen min_age_at_nat = min(age_nat_13_censored)
bysort sibling_id_ma: egen max_age_at_nat = max(age_nat_13_censored)

* Within-group range; a row-wise difference, so no by-prefix is needed
generate max_diff_age_at_nat = max_age_at_nat - min_age_at_nat

* NOTE(review): tag_ma is not created in this script; presumably it is stored in
* sfe_sample.dta and flags one observation per sibling_id_ma - confirm
histogram max_diff_age_at_nat if tag_ma, ///
	discrete width(1) percent ///
	color(red%30) graphregion(color(white)) ///
	xtick(1(1)16) ///
	xtitle("Maximum difference in age at naturalisation between siblings", height(4)) ///
	ytitle("Percentage (%)", height(6))
graph export FigureS5.png, replace


